# Discretion in Clinical Decision-Making
# Claire Boone, March 2025
# Compare included vs. excluded clinics
# Creates Table A3


# set up -----------------
  library(tidyverse)
  library(dplyr)
  library(magrittr)
  library(broom)
  library(lmtest)
  library(sandwich)
  library(here)
  
  proj_dir <- here::here()
  raw_data <- file.path(proj_dir, "raw_data/")
  gen_data <- file.path(proj_dir, "gen_data/")
  out <- file.path(proj_dir, "out/")
    
# load EHR data ---------------------------
  load(paste0(gen_data,"ehr_cleaned_analysis_sample.Rda"))  
  
  hd_all_visis <- hd
  hd %<>% dplyr::filter(rows_n==1) 
    
# import CASEN 2015 Socioeconomic Survey data for municipality level SES variables ----------------------------
  casen <- read.csv(paste0(gen_data, "casen_micro.csv"), check.names=FALSE)
  hd <- left_join(hd, casen, by = "comuna")
  rm(casen)
  
# cleaning -----------------------
  hd %<>% group_by(cod_est) %<>% arrange(fecha_atencion) %<>%
    dplyr::mutate(rows_clinic = row_number()) %<>% 
    dplyr::mutate(rows_max_clinic = max(rows_clinic)) %<>%
    ungroup()
  
  hd %<>% dplyr::mutate(male = ifelse(male==5, 1, male))
  
  hd %<>% dplyr::mutate(talla = ifelse(talla>213, 213, talla))
  hd %<>% dplyr::mutate(peso = ifelse(peso>200, 200, peso))
  hd$bmi <- (hd$peso)/(((hd$talla)/100)^2)
  hd %<>% dplyr::mutate(bmi = ifelse((is.na(talla) | is.na(peso)), NA, bmi))
  cut_point_top <- quantile(hd$bmi, 0.98, na.rm = T)
  cut_point_bottom <- quantile(hd$bmi, 0.01,  na.rm = T)
  hd %<>% mutate(outlier_top = (bmi >= cut_point_top), 
                 outlier_bottom = (bmi <= cut_point_bottom)) 
  hd$bmi_w <- ifelse(hd$outlier_top, cut_point_top, ifelse(hd$outlier_bottom, cut_point_bottom, hd$bmi))
  hist(hd$bmi_w)
  
  hd %<>% dplyr::mutate(bmi_under = ifelse(bmi_w <18.5, 1, 0)) 
  hd %<>% dplyr::mutate(bmi_normal = ifelse(bmi_w >=18.5 & bmi_w < 25, 1, 0)) 
  hd %<>% dplyr::mutate(bmi_over = ifelse(bmi_w >=25 & bmi_w < 30, 1, 0)) 
  hd %<>% dplyr::mutate(bmi_obese = ifelse(bmi_w >=30, 1, 0)) 
  hd %<>% dplyr::mutate(bmi_obese1 = ifelse(bmi>=30 & bmi<35, 1, 0))
  hd %<>% dplyr::mutate(bmi_obese2 = ifelse(bmi>=35 & bmi<40, 1, 0))
  hd %<>% dplyr::mutate(bmi_obese3 = ifelse(bmi>=40, 1, 0))
  
  cut_point_top <- quantile(hd$circ_cintura, 0.99, na.rm = T)
  cut_point_bottom <- quantile(hd$circ_cintura, 0.01,  na.rm = T)
  hd %<>% mutate(outlier_top = (circ_cintura >= cut_point_top), 
                 outlier_bottom = (circ_cintura <= cut_point_bottom)) 
  hd$circ_cintura_w <- ifelse(hd$outlier_top, cut_point_top, ifelse(hd$outlier_bottom, cut_point_bottom, hd$circ_cintura))
  hist(hd$circ_cintura_w)
  
  hd %<>% dplyr::mutate(waist_obese = ifelse(((male==1 & circ_cintura>=102) | (male==0 & circ_cintura>=88)), 1, 0))
  
  hd %<>% dplyr::mutate(htn0 = ifelse((pa_sys<140|pa_dia<90), 1, 0))
  hd %<>% dplyr::mutate(htn1 = ifelse(((pa_sys>=140 & pa_sys<160)|(pa_dia>=90 & pa_dia<100)), 1, 0))
  hd %<>% dplyr::mutate(htn2 = ifelse(((pa_sys>=160 & pa_sys<180)|(pa_dia>=100 & pa_dia<110)), 1, 0))
  hd %<>% dplyr::mutate(htn3 = ifelse(((pa_sys>=180)|(pa_dia>=110)), 1, 0))
  
  hd %<>% dplyr::mutate(chol0 = ifelse(ldl_mgdl<130, 1, 0))
  hd %<>% dplyr::mutate(chol1 = ifelse(ldl_mgdl>=130 & ldl_mgdl<160, 1, 0))
  hd %<>% dplyr::mutate(chol2 = ifelse(ldl_mgdl>=160 & ldl_mgdl<190, 1, 0))
  hd %<>% dplyr::mutate(chol3 = ifelse(ldl_mgdl>=190, 1, 0))  
  
  hd %<>% dplyr::mutate(sedentarismo = ifelse(is.na(sedentarismo), 0, sedentarismo))
  
  hd$clinic_type <- 0
  hd %<>% dplyr::mutate(clinic_type = ifelse(type_consrural==1, 1, clinic_type))
  hd %<>% dplyr::mutate(clinic_type = ifelse(type_consurb==1, 2, clinic_type))
  hd %<>% dplyr::mutate(clinic_type = ifelse(type_hospbaja==1, 3, clinic_type))
  hd %<>% dplyr::mutate(type_unknown= ifelse(clinic_type==0, 1, 0))
  
  hd$log_visits <- log(hd$rows_max_clinic)
  
  hd$ypch_usd2015 <- hd$ypch_mean / 711.03
  hd$ypch_usd2021 <- hd$ypch_usd2015 * 1.11
  
  hd$income <- log(hd$ypch_usd2021)
  hd$employed <- hd$activ_1_mean
  hd$primary_edu <- hd$primary
  hd$secondary_edu <- hd$secondary
  hd$tertiary_edu <- hd$tertiary
  hd$secondary_or_less <- hd$secondary + hd$primary
  hd$literacy <- hd$e1_1_mean
  hd %<>% dplyr::mutate(literacy = ifelse(hd$literacy>=0.97, 1, 0))
  hd$mean_age <- hd$edad_mean
  
  # clinics are eligible for inclusion if they have above the median number of patient-visits (1265):
  hd %<>% dplyr::mutate(included_clinic = ifelse(rows_max_clinic>=1265, 1, 0))
  table(hd[hd$rows_clinic==1,]$included_clinic) 
  
# sample exclusions (regardless of size of clinic, we do not consider minors or patients >80) ----
  hd %<>% dplyr::filter(age_at_visit<80 & age_at_visit>=18)

  hd %<>% group_by(cod_est) %<>% arrange(fecha_atencion) %<>%
    dplyr::mutate(rows_clinic = row_number()) %<>% 
    dplyr::mutate(rows_max_clinic = max(rows_clinic)) %<>%
    ungroup()
  table(hd[hd$rows_clinic==1,]$included_clinic) 

  hd %<>% dplyr::mutate(followup = as.numeric(as.Date("2018-12-31") - fecha_atencion))
  hd$followup_yrs <- hd$followup/365.25
  
  hd %<>% dplyr::filter(followup_yrs>=1) 
  
  hd %<>% group_by(cod_est) %<>% arrange(fecha_atencion) %<>%
    dplyr::mutate(rows_clinic = row_number()) %<>% 
    dplyr::mutate(rows_max_clinic = max(rows_clinic)) %<>%
    ungroup()
  table(hd[hd$rows_clinic==1,]$included_clinic) 
  
  # additional exclusions: clinics where i am not able to estimate the mangnitude of bunching (computation breaks)
  break_loops <- c(109303, 115305, 104316, 107317, 114334)
  hd %<>% dplyr::mutate(included_clinic = ifelse(cod_est %in% break_loops, 0, included_clinic))
  table(hd[hd$rows_clinic==1,]$included_clinic) 
  
   
# (TAB A3) -----
  
  selected_vars <- c("male", "age_at_visit", "diag_dm10", 
                     "bmi_normal", "bmi_over", "bmi_obese", "waist_obese",  "chol0", "chol1", 
                     "chol2", "chol3", "sedentarismo", "fonasa_a", "fonasa_b", "fonasa_c", "fonasa_d",
                     'income',  'mean_age', 'secondary_or_less', 'tertiary_edu',  'rural_mean', 'log_visits',
                     'type_consurb', 'type_consrural', 'type_hospbaja')
  
  sd <- hd
  sd %<>% dplyr::select(included_clinic, cod_est, all_of(selected_vars)) 
  sd %<>% group_by(cod_est, included_clinic) %<>% summarise_all(~mean(.x, na.rm = TRUE)) 
  
  balance_table <- sd %>%
    select(included_clinic, all_of(selected_vars), -cod_est) %>%
    group_by(included_clinic) %>%
    summarise_all(~mean(.x, na.rm = TRUE)) %>%
    pivot_longer(-included_clinic, names_to = "variable", values_to = "mean") %>%
    pivot_wider(names_from = included_clinic, values_from = mean, 
                names_prefix = "clinic_") %>%
    mutate(mean_diff = clinic_1 - clinic_0)
  
  balance_table$p_value <- sapply(balance_table$variable, function(var) {
    t.test(reformulate('included_clinic', response = var), data = sd)$p.value
  })
  
  balance_table <- balance_table[2:nrow(balance_table),]
  
  n_cod_est <- sd %>%
    group_by(included_clinic) %>%
    summarise(n_cod_est = n_distinct(cod_est)) %>%
    pivot_wider(names_from = included_clinic, values_from = n_cod_est, 
                names_prefix = "clinic_") %>%
    mutate(variable = "N Clinics", mean_diff = NA, p_value = NA)
  
  balance_table <- balance_table %>%
    bind_rows(n_cod_est) %>%
    select(variable, clinic_0, clinic_1, mean_diff, p_value)
  
  names(balance_table) <- c("Variable", "Excluded Clinics", "Included Clinics", "Mean Diff.", "p-val")
  
  balance_table %<>% dplyr::mutate(Variable = ifelse(Variable=="male", "Male", Variable),
                                   Variable = ifelse(Variable=="age_at_visit", "Age", Variable),
                                   Variable = ifelse(Variable=="diag_dm10", "Type 2 Diabetes", Variable),
                                   Variable = ifelse(Variable=="bmi_normal", "BMI: Normal", Variable),
                                   Variable = ifelse(Variable=="bmi_over", "BMI: Overweight", Variable),
                                   Variable = ifelse(Variable=="bmi_obese", "BMI: Obese", Variable),
                                   Variable = ifelse(Variable=="waist_obese", "Waist: Obese", Variable),
                                   Variable = ifelse(Variable=="chol0", "Cholesterol: normal", Variable),
                                   Variable = ifelse(Variable=="chol1", "Cholesterol: high level 1", Variable),
                                   Variable = ifelse(Variable=="chol2", "Cholesterol: high level 2", Variable),
                                   Variable = ifelse(Variable=="chol3", "Cholesterol: high level 3", Variable),
                                   Variable = ifelse(Variable=="sedentarismo", "Sedentary", Variable),
                                   Variable = ifelse(Variable=="fonasa_a", "Fonasa A", Variable),
                                   Variable = ifelse(Variable=="fonasa_b", "Fonasa B", Variable),
                                   Variable = ifelse(Variable=="fonasa_c", "Fonasa C", Variable),
                                   Variable = ifelse(Variable=="fonasa_d", "Fonasa D", Variable),
                                   Variable = ifelse(Variable=="income", "Municipality mean log income", Variable),
                                   Variable = ifelse(Variable=="mean_age", "Municipality mean age", Variable),
                                   Variable = ifelse(Variable=="secondary_or_less", "Municipality share secondary edu. or less", Variable),
                                   Variable = ifelse(Variable=="tertiary_edu", "Municipality share tertiary edu. or more", Variable),
                                   Variable = ifelse(Variable=="rural_mean", "Municipality share rural", Variable),
                                   Variable = ifelse(Variable=="log_visits", "Log visits at clinic", Variable),
                                   Variable = ifelse(Variable=="type_consurb", "Urban clinic", Variable),
                                   Variable = ifelse(Variable=="type_consrural", "Rural clinic", Variable),
                                   Variable = ifelse(Variable=="type_hospbaja", "Low complexity hospital", Variable))

  
  print(xtable(balance_table), type = "latex",  include.rownames=FALSE, file  = paste0(leaf, "balance_excluded_xtable.tex"))
  

  # do not save this version of the data
 